---
title: "Y1 Study Skills Intervention - Data Cleaning and Analysis"
author: "Fernando Rodriguez"
date: "Last updated on 07-07-18 by FR"
output:
  html_document: default
  pdf_document: default
created on: 3-17-17
editor_options: 
  chunk_output_type: inline
---

```{r setup,}
# Global knitr default: echo = TRUE shows each chunk's source code in the
# knitted output unless a chunk overrides it (several chunks below set echo = F).
knitr::opts_chunk$set(echo = TRUE)
```
# Step 1 - DATA IMPORTING AND CLEANING

```{r}
# Loading relevant packages (nothing is installed here; each package must already be installed)
## Loading Libraries
library(splitstackshape)
library(RColorBrewer)
library(ggplot2)
library(reshape2)
library(plyr)
library(doBy)
library(dplyr)
library(psych)
library(gridExtra)
library(grid)
library(sjstats) 
library(fields)
library(fifer)
library(skimr) # for quickly summarizing data

```


## Importing Year 1 Data
```{r}
# Reading the raw data CSV (first row = column names) into `ssdata`, the
# object every later chunk works on.
# NOTE(review): this document is titled Year 1, but the file name says "Y2" --
# confirm that this is the intended input file.
ssdata <- read.csv(file = "Study Skills Y2 Raw Data.csv", header = TRUE)

```



## Demographic Variables - Checking and Cleaning
```{r}
# First-generation status: frequencies of the original coding
table(ssdata$firstgeneration_rec)

# Creating firstgeneration_rec2, which keeps only the codes 0 and 1.
# Every other value (e.g. 9 = other/unknown) and missing values stay NA.
ssdata$firstgeneration_rec2 <- NA
ssdata$firstgeneration_rec2[which(ssdata$firstgeneration_rec == 0)] <- 0
ssdata$firstgeneration_rec2[which(ssdata$firstgeneration_rec == 1)] <- 1

# exclude = NULL keeps the NA category visible in the check
table(ssdata$firstgeneration_rec2, exclude = NULL)


# Low-income status: frequencies only, no recoding
table(ssdata$lowincomeflag_rec)

```

## Study Strategy  - Checking and Cleaning
```{r}
## Cleaning the variables that capture the different study strategies
# Empty-string responses are recoded to NA so that blank answers count as
# missing.

# pre-survey: frequencies before the recode, first rows after it
table(ssdata$pre_studystrat)
ssdata$pre_studystrat[which(ssdata$pre_studystrat == "")] <- NA
head(ssdata$pre_studystrat)

# post-survey: first rows before and after the recode
head(ssdata$post_studystrat)
ssdata$post_studystrat[which(ssdata$post_studystrat == "")] <- NA
head(ssdata$post_studystrat)
```

## Ethnicity & URM Status - Checking and Cleaning
```{r}
# Ethnicity: structure and frequencies of the variable as it comes from the
# data (inspection only, nothing is recoded in this chunk)
str(ssdata$ethnicity_rec)
table(ssdata$ethnicity_rec)

# URM status: overall frequencies, then cross-tabulated by course section
table(ssdata$urm)
table(ssdata$urm, ssdata$section)

```

## Gender - Checking and Cleaning
```{r, echo = F}
# Gender: inspect the structure, convert to a factor, inspect again
str(ssdata$gender_rec)
ssdata$gender_rec <- as.factor(ssdata$gender_rec)
str(ssdata$gender_rec)

# Attaching readable labels to the codes 0, 1 and 9
ssdata$gender_rec <- factor(
  ssdata$gender_rec,
  levels = c(0, 1, 9),
  labels = c("Female", "Male", "Other/Unknown")
)

# Creating gender_rec2, which drops "Other/Unknown" (those rows stay NA)
table(ssdata$gender_rec)
ssdata$gender_rec2 <- NA
ssdata$gender_rec2[which(ssdata$gender_rec == "Female")] <- 0
ssdata$gender_rec2[which(ssdata$gender_rec == "Male")] <- 1
table(ssdata$gender_rec2)

# Labelling the two remaining codes and checking the result
ssdata$gender_rec2 <- factor(
  ssdata$gender_rec2,
  levels = c(0, 1),
  labels = c("Female", "Male")
)
table(ssdata$gender_rec2)

str(ssdata$gender_rec2)
```

## Years at Institution - Checking and Cleaning
```{r, echo = F}


# Frequencies of the first registered academic year
table(ssdata$firstregacadyr)

# Recoding the first registered academic year into the number of years
# enrolled ("2016-17" = 1 ... "2011-12" = 6). Any value without an entry in the
# lookup (including missing values) becomes NA.
years_lookup <- c(
  "2016-17" = 1, "2015-16" = 2, "2014-15" = 3,
  "2013-14" = 4, "2012-13" = 5, "2011-12" = 6
)
ssdata$years_rec <- unname(years_lookup[as.character(ssdata$firstregacadyr)])
table(ssdata$years_rec)


# Boxplot of grades by years enrolled; the diamond marks the group mean.
# The former `na.omit = TRUE` argument to ggplot() was dropped: ggplot() has no
# such argument, so it had no effect on the plot.
# NOTE(review): `fun.y` is deprecated in favour of `fun` from ggplot2 3.3.0 on;
# it is kept here because the installed ggplot2 version is not known.
ggplot(ssdata, aes(x = years_rec, y = grade_rec, group = years_rec)) +
  geom_boxplot() +
  stat_summary(fun.y = mean, geom = "point", shape = 5, size = 4) +
  labs(x = "Years Enrolled", y = "Final Grade",
       title = "Final Grade by Years Enrolled") +
  scale_y_continuous(limits = c(0, 13), breaks = 1:13) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

```


## Splitting Study Strategies Question
```{r}
## Using `cSplit` from the `splitstackshape` package to split the
## comma-separated `pre_studystrat` / `post_studystrat` answers into one
## column per selection (`<name>_01`, `<name>_02`, ...).

# Peek at the first 10 response patterns. head() replaces `[0:10]`: R indexing
# is 1-based, so the 0 was silently dropped and the result was padded with NA
# whenever fewer than 10 patterns existed.
head(table(ssdata$pre_studystrat), 10)

# pre-survey
ssdata <- cSplit(ssdata, "pre_studystrat", ",")

# checking output: frequencies of each of the 10 pre-survey split columns
for (split_col in sprintf("pre_studystrat_%02d", 1:10)) {
  print(table(ssdata[[split_col]]))
}

# post-survey
ssdata <- cSplit(ssdata, "post_studystrat", ",")

# checking output: frequencies of each of the 11 post-survey split columns
for (split_col in sprintf("post_studystrat_%02d", 1:11)) {
  print(table(ssdata[[split_col]]))
}
```


## Coding the different study strategies into their own variables
## Top 3 strategies only
## 1 = self-test
```{r}

# pre- self-test
# Names of the split columns that are scanned for each strategy code. Only the
# first three split columns (_01 to _03) are listed.
prestudycol <- paste0("pre_studystrat_0", 1:3)

str(prestudycol)
str(ssdata$pre_studystrat_01)

# `pre_ss_selftest` is 1 when the value 1 appears in any of the `prestudycol`
# columns and 0 otherwise; missing entries are skipped via na.rm = TRUE
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 1, na.rm = TRUE)
ssdata$pre_ss_selftest <- (pre_hits > 0) * 1

table(ssdata$pre_ss_selftest)

# post- self-test
# Same approach, using the first three post-survey split columns
poststudycol <- paste0("post_studystrat_0", 1:3)

post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 1, na.rm = TRUE)
ssdata$post_ss_selftest <- (post_hits > 0) * 1

table(ssdata$post_ss_selftest)
# checking frequencies by course section
table(ssdata$post_ss_selftest, ssdata$section)



```


## 2 = Use flashcards
```{r}

# pre- flashcards
# `pre_ss_flashc` is 1 when the value 2 appears in any `prestudycol` column,
# 0 otherwise
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 2, na.rm = TRUE)
ssdata$pre_ss_flashc <- (pre_hits > 0) * 1


# post- flashcards
# `post_ss_flashc` is 1 when the value 2 appears in any `poststudycol` column,
# 0 otherwise
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 2, na.rm = TRUE)
ssdata$post_ss_flashc <- (post_hits > 0) * 1

# checking frequencies by course section
table(ssdata$post_ss_flashc, ssdata$section)
```



## 3 = Reread chapters, articles, notes, etc.
```{r}
# pre- reread
# `pre_ss_reread` is 1 when the value 3 appears in any `prestudycol` column,
# 0 otherwise
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 3, na.rm = TRUE)
ssdata$pre_ss_reread <- (pre_hits > 0) * 1


# post- reread
# `post_ss_reread` is 1 when the value 3 appears in any `poststudycol` column,
# 0 otherwise
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 3, na.rm = TRUE)
ssdata$post_ss_reread <- (post_hits > 0) * 1

```


## 4 = Underlining or highlighting (high)
```{r}
# pre- high
# `pre_ss_high` is 1 when the value 4 appears in any `prestudycol` column,
# 0 otherwise
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 4, na.rm = TRUE)
ssdata$pre_ss_high <- (pre_hits > 0) * 1

# post- high
# `post_ss_high` is 1 when the value 4 appears in any `poststudycol` column,
# 0 otherwise
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 4, na.rm = TRUE)
ssdata$post_ss_high <- (post_hits > 0) * 1

```


## 5 = Recopy notes
```{r}

# pre- recopy: 1 when the value 5 appears in any `prestudycol` column
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 5, na.rm = TRUE)
ssdata$pre_ss_recopy <- (pre_hits > 0) * 1

# post- recopy: 1 when the value 5 appears in any `poststudycol` column
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 5, na.rm = TRUE)
ssdata$post_ss_recopy <- (post_hits > 0) * 1


```


## 6 = Condensing or summarizing notes
```{r}

# pre- condense: 1 when the value 6 appears in any `prestudycol` column
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 6, na.rm = TRUE)
ssdata$pre_ss_condense <- (pre_hits > 0) * 1

# post- condense: 1 when the value 6 appears in any `poststudycol` column
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 6, na.rm = TRUE)
ssdata$post_ss_condense <- (post_hits > 0) * 1

```

## 7 = Recopy notes from memory

```{r}

# pre- recopymem: 1 when the value 7 appears in any `prestudycol` column
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 7, na.rm = TRUE)
ssdata$pre_ss_recopymem <- (pre_hits > 0) * 1

# post- recopymem: 1 when the value 7 appears in any `poststudycol` column
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 7, na.rm = TRUE)
ssdata$post_ss_recopymem <- (post_hits > 0) * 1

```


## 8 = Make diagrams

```{r}

# pre- diag: 1 when the value 8 appears in any `prestudycol` column
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 8, na.rm = TRUE)
ssdata$pre_ss_diag <- (pre_hits > 0) * 1

# post- diag: 1 when the value 8 appears in any `poststudycol` column
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 8, na.rm = TRUE)
ssdata$post_ss_diag <- (post_hits > 0) * 1

```


## 9 = Study with friends

```{r}
# pre- friends: 1 when the value 9 appears in any `prestudycol` column
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 9, na.rm = TRUE)
ssdata$pre_ss_friends <- (pre_hits > 0) * 1

# post- friends: 1 when the value 9 appears in any `poststudycol` column
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 9, na.rm = TRUE)
ssdata$post_ss_friends <- (post_hits > 0) * 1

```



## 10 = Absorbing info (cramming) night before
```{r}
# pre- cram: 1 when the value 10 appears in any `prestudycol` column
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 10, na.rm = TRUE)
ssdata$pre_ss_cram <- (pre_hits > 0) * 1

# post- cram: 1 when the value 10 appears in any `poststudycol` column
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 10, na.rm = TRUE)
ssdata$post_ss_cram <- (post_hits > 0) * 1

```


## 11 = Watch videos
```{r}
# pre- videos: 1 when the value 11 appears in any `prestudycol` column
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 11, na.rm = TRUE)
ssdata$pre_ss_videos <- (pre_hits > 0) * 1

# post- videos: 1 when the value 11 appears in any `poststudycol` column
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 11, na.rm = TRUE)
ssdata$post_ss_videos <- (post_hits > 0) * 1

```

## 12 = Other

```{r}

# pre- other: 1 when the value 12 appears in any `prestudycol` column
pre_hits <- rowSums(ssdata[, prestudycol, with = FALSE] == 12, na.rm = TRUE)
ssdata$pre_ss_other <- (pre_hits > 0) * 1

# post- other: 1 when the value 12 appears in any `poststudycol` column
post_hits <- rowSums(ssdata[, poststudycol, with = FALSE] == 12, na.rm = TRUE)
ssdata$post_ss_other <- (post_hits > 0) * 1

```


# Step 2 - DATA EXPORTING

## Exporting Y1 Pre-Survey Study Strat and URM status variables to csv file
This is done in order to merge these variables with the same Y2 variables for the following two manuscript result sections: 
1) study behaviors at week 1 (Table 2)
2) Percent of students’ study decisions (Table 3)
```{r}
# Selecting the identifier, status, study-pattern, study-strategy,
# study-decision and grade columns that are written out for the merge.
# NOTE(review): `pre_ss_reread` and `pre_ss_cram` are created above but are not
# part of this selection, while every other pre_ss_* indicator is -- confirm
# that the omission is intended.
studystratPre <- subset(
  ssdata,
  select = c(
    roster_randomid, urm, roster_status, pre_status, post_status,
    pre_studypat, post_studypat,
    pre_ss_selftest, post_ss_selftest,
    pre_ss_videos, pre_ss_condense, pre_ss_diag, pre_ss_high,
    pre_ss_flashc, pre_ss_friends, pre_ss_recopy, pre_ss_recopymem,
    pre_ss_other,
    pre_studydecide_chall, pre_studydecide_soon, pre_studydecide_cons,
    pre_studydecide_easy, pre_studydecide_imp, pre_studydecide_interest,
    pre_studydecide_sched, pre_studydecide_other,
    post_studydecide_chall, post_studydecide_soon, post_studydecide_cons,
    post_studydecide_easy, post_studydecide_imp, post_studydecide_interest,
    post_studydecide_sched, post_studydecide_other,
    grade_rec
  )
)

# Tagging every exported row with year = 1 and checking the count
studystratPre$year <- 1
table(studystratPre$year)

# Writing the selection to disk without the row-name column
write.csv(studystratPre, "Year1studystrat.csv", row.names = FALSE)

```




# Step 3 - SAMPLE DESCRIPTIVES
## Demographics and Prior Academic Background
## (Table 1)
```{r}
# Sample descriptives: table() gives frequencies for the categorical
# variables, skim() (skimr package) gives summaries for the numeric ones.

# Section
table(ssdata$section)

# Age
skim(ssdata$age)

# Gender (three-category version that still includes "Other/Unknown")
table(ssdata$gender_rec)

# Ethnicity
# NOTE(review): `ethnicity_rec2` is not created anywhere in this document
# (only `ethnicity_rec` is inspected earlier) -- confirm it exists in the raw
# data.
table(ssdata$ethnicity_rec2)

# URM
table(ssdata$urm)

# First Gen (recoded version: only the codes 0 and 1 are kept)
table(ssdata$firstgeneration_rec2)

# Low Income
table(ssdata$lowincomeflag_rec)

# years at UCI (first registered academic year)
table(ssdata$firstregacadyr)

# SAT total
skim(ssdata$sattotalscore)

# College Cumulative GPA
skim(ssdata$gpacumulative)

# Final Course Grade
skim(ssdata$grade_rec)
        
```


# Step 4 - DATA ANALYSIS AND VISUALIZATIONS
## Impact of the study skills intervention on spacing and self-testing
## Spacing Results
```{r, echo = F}
# obtaining proportions of the strategies students reported using at pre

# spacing vs. massing
table(ssdata$pre_studypat)
prop.table(table(ssdata$pre_studypat))


# comparing numbers to the study strategy - cramming
table(ssdata$pre_studypat, ssdata$pre_ss_cram)


# PRE
# Creating the spacing score `pre_spacing_recode`: it is 1 only when the
# student chose spacing (pre_studypat == 1) and did not select cramming
# (pre_ss_cram == 0); the other three combinations are coded 0. Rows where
# pre_studypat is missing stay NA. A table is printed after each step to track
# the counts.

ssdata$pre_spacing_recode <- NA
ssdata$pre_spacing_recode[ssdata$pre_studypat == 0 & ssdata$pre_ss_cram == 1] <- 0 # cram, cram
table(ssdata$pre_spacing_recode)

ssdata$pre_spacing_recode[ssdata$pre_studypat == 1 & ssdata$pre_ss_cram == 1] <- 0 # space, cram
table(ssdata$pre_spacing_recode)

ssdata$pre_spacing_recode[ssdata$pre_studypat == 0 & ssdata$pre_ss_cram == 0] <- 0 # cram, cram not-selected
table(ssdata$pre_spacing_recode)

ssdata$pre_spacing_recode[ssdata$pre_studypat == 1 & ssdata$pre_ss_cram == 0] <- 1 # space, cram not-selected
table(ssdata$pre_spacing_recode) # new variable
table(ssdata$pre_studypat) # comparing to old variable


# proportion table of spacing vs. cramming
prop.table(table(ssdata$pre_spacing_recode)) # new variable

# overall t-test
t.test(ssdata$grade_rec ~ ssdata$pre_spacing_recode)

# chisquare by section (pre). The matching post test is run in the POST part
# below: `post_spacing_recode` does not exist yet at this point, so testing it
# here would use a missing column.
chisq.test(ssdata$section, ssdata$pre_spacing_recode)


# t-test for section a
ssdata_secA <- subset(ssdata, section == "Intervention Section")
t.test(ssdata_secA$grade_rec ~ ssdata_secA$pre_spacing_recode)

# t-test for section b
ssdata_secB <- subset(ssdata, section == "Control Section 1")
t.test(ssdata_secB$grade_rec ~ ssdata_secB$pre_spacing_recode)

# t-test for section c
ssdata_secC <- subset(ssdata, section == "Control Section 2")
t.test(ssdata_secC$grade_rec ~ ssdata_secC$pre_spacing_recode)



# POST
# Same coding for the post-survey: 1 = spacing chosen and cramming not
# selected, 0 = any other combination, NA when post_studypat is missing
ssdata$post_spacing_recode <- NA
ssdata$post_spacing_recode[ssdata$post_studypat == 0 & ssdata$post_ss_cram == 1] <- 0 # cram, cram
table(ssdata$post_spacing_recode)

ssdata$post_spacing_recode[ssdata$post_studypat == 1 & ssdata$post_ss_cram == 1] <- 0 # space, cram
table(ssdata$post_spacing_recode)

ssdata$post_spacing_recode[ssdata$post_studypat == 0 & ssdata$post_ss_cram == 0] <- 0 # cram, cram not-selected
table(ssdata$post_spacing_recode)

ssdata$post_spacing_recode[ssdata$post_studypat == 1 & ssdata$post_ss_cram == 0] <- 1 # space, cram not-selected
table(ssdata$post_spacing_recode) # new variable
table(ssdata$post_studypat) # comparing to old variable


# proportion table of spacing vs. cramming
prop.table(table(ssdata$post_spacing_recode)) # new variable

# overall t-test
t.test(ssdata$grade_rec ~ ssdata$post_spacing_recode)

# chisquare by section (post), now that the variable exists
chisq.test(ssdata$section, ssdata$post_spacing_recode)


# The section subsets are rebuilt so that they contain `post_spacing_recode`

# t-test for section a
ssdata_secA <- subset(ssdata, section == "Intervention Section")
t.test(ssdata_secA$grade_rec ~ ssdata_secA$post_spacing_recode)

# t-test for section b
ssdata_secB <- subset(ssdata, section == "Control Section 1")
t.test(ssdata_secB$grade_rec ~ ssdata_secB$post_spacing_recode)

# t-test for section c
ssdata_secC <- subset(ssdata, section == "Control Section 2")
t.test(ssdata_secC$grade_rec ~ ssdata_secC$post_spacing_recode)


```



## Descriptives of study strategies and final course grades
## Pre (Not included in the manuscript, but included here for public reference)
```{r, echo = F}
# study strategies
# For each pre-survey strategy indicator: the proportion of students coded
# 0 vs. 1, followed by a t-test (t.test() defaults, i.e. Welch) comparing final
# course grade between the two groups.

# spacing vs. cramming
table(ssdata$pre_spacing_recode)
prop.table(table(ssdata$pre_spacing_recode))
t.test(ssdata$grade_rec~ssdata$pre_spacing_recode)


# self-test
table(ssdata$pre_ss_selftest)
prop.table(table(ssdata$pre_ss_selftest))
t.test(ssdata$grade_rec~ssdata$pre_ss_selftest)

# flashcards
prop.table(table(ssdata$pre_ss_flashc))
t.test(ssdata$grade_rec~ssdata$pre_ss_flashc)

# recopy notes from memory
prop.table(table(ssdata$pre_ss_recopymem))
t.test(ssdata$grade_rec~ssdata$pre_ss_recopymem)

# re-read
prop.table(table(ssdata$pre_ss_reread))
t.test(ssdata$grade_rec~ssdata$pre_ss_reread)

# underlining or highlighting
prop.table(table(ssdata$pre_ss_high))
t.test(ssdata$grade_rec~ssdata$pre_ss_high)

# recopying notes
prop.table(table(ssdata$pre_ss_recopy))
t.test(ssdata$grade_rec~ssdata$pre_ss_recopy)

# condensing or summarizing notes
prop.table(table(ssdata$pre_ss_condense))
t.test(ssdata$grade_rec~ssdata$pre_ss_condense)

# make diagrams
prop.table(table(ssdata$pre_ss_diag))
t.test(ssdata$grade_rec~ssdata$pre_ss_diag)

# study with friends
prop.table(table(ssdata$pre_ss_friends))
t.test(ssdata$grade_rec~ssdata$pre_ss_friends)

# watch videos
prop.table(table(ssdata$pre_ss_videos))
t.test(ssdata$grade_rec~ssdata$pre_ss_videos)

# other
prop.table(table(ssdata$pre_ss_other))
t.test(ssdata$grade_rec~ssdata$pre_ss_other)


```


## Descriptives of study strategies and final course grades
## Post 
## (Table 5)
```{r, echo = F}
# study strategies
# For each post-survey strategy indicator: the proportion of students coded
# 0 vs. 1, grade descriptives per group (describeBy), and a t-test comparing
# final course grade between the two groups.


# spacing vs. cramming
table(ssdata$post_spacing_recode)
prop.table(table(ssdata$post_spacing_recode))
describeBy(ssdata$grade_rec, ssdata$post_spacing_recode, mat = TRUE)
# na.rm = TRUE so that a single missing grade does not turn a group's mean/sd
# into NA; t.test() below also drops missing values
ssdata %>%
  group_by(post_spacing_recode) %>%
  summarize(mean = mean(grade_rec, na.rm = TRUE),
            sd = sd(grade_rec, na.rm = TRUE))
t.test(ssdata$grade_rec ~ ssdata$post_spacing_recode)
# same comparison assuming equal variances (Student's t-test)
t.test(ssdata$grade_rec ~ ssdata$post_spacing_recode, var.equal = TRUE)



# self-test
prop.table(table(ssdata$post_ss_selftest))
describeBy(ssdata$grade_rec, ssdata$post_ss_selftest, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_selftest)

# flashcards
prop.table(table(ssdata$post_ss_flashc))
describeBy(ssdata$grade_rec, ssdata$post_ss_flashc, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_flashc)

# recopy notes from memory
prop.table(table(ssdata$post_ss_recopymem))
describeBy(ssdata$grade_rec, ssdata$post_ss_recopymem, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_recopymem)

# re-read
prop.table(table(ssdata$post_ss_reread))
describeBy(ssdata$grade_rec, ssdata$post_ss_reread, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_reread)

# underlining or highlighting
prop.table(table(ssdata$post_ss_high))
describeBy(ssdata$grade_rec, ssdata$post_ss_high, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_high)


# recopying notes
prop.table(table(ssdata$post_ss_recopy))
describeBy(ssdata$grade_rec, ssdata$post_ss_recopy, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_recopy)


# condensing or summarizing notes
prop.table(table(ssdata$post_ss_condense))
describeBy(ssdata$grade_rec, ssdata$post_ss_condense, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_condense)


# make diagrams
prop.table(table(ssdata$post_ss_diag))
describeBy(ssdata$grade_rec, ssdata$post_ss_diag, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_diag)


# study with friends
prop.table(table(ssdata$post_ss_friends))
describeBy(ssdata$grade_rec, ssdata$post_ss_friends, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_friends)



# watch videos
prop.table(table(ssdata$post_ss_videos))
describeBy(ssdata$grade_rec, ssdata$post_ss_videos, mat = TRUE)
t.test(ssdata$grade_rec ~ ssdata$post_ss_videos)

# other (no group descriptives are produced for this one)
prop.table(table(ssdata$post_ss_other))
t.test(ssdata$grade_rec ~ ssdata$post_ss_other)

```



# Pre- and Post-Comparisons for Spacing
```{r}

# PRE: counts of students coded as spacing (1) vs. not (0), overall and by
# section
table(ssdata$pre_spacing_recode)

table(ssdata$pre_spacing_recode, ssdata$section)

# Recoding section as numeric values: Intervention Section = 0,
# Control Section 1 = 1, Control Section 2 = 2; any other value becomes NA
section_labels <- c("Intervention Section", "Control Section 1",
                    "Control Section 2")
ssdata$section_numeric <- match(ssdata$section, section_labels) - 1

# checking recode results
table(ssdata$section)
table(ssdata$section_numeric)

# section x pre-spacing contingency table and its chi-square test
tb1 <- table(ssdata$section_numeric, ssdata$pre_spacing_recode)
tb1

chisq.test(tb1)



# chi-square test by section, run directly on the two variables
chisq.test(ssdata$section, ssdata$pre_spacing_recode)
chisq.test(ssdata$section, ssdata$post_spacing_recode)


# post-hoc: pairwise comparisons on post-survey spacing

# Intervention vs Control 1 (rows from Control Section 2 removed)
posthoc_secAvB <- subset(ssdata, section != "Control Section 2")
table(posthoc_secAvB$section)

chisq.test(posthoc_secAvB$section, posthoc_secAvB$post_spacing_recode)

# Intervention vs Control 2 (rows from Control Section 1 removed)
posthoc_secAvC <- subset(ssdata, section != "Control Section 1")
table(posthoc_secAvC$section)

chisq.test(posthoc_secAvC$section, posthoc_secAvC$post_spacing_recode)


# chi-square test method 2: built from the section x post-spacing table

table(ssdata$post_spacing_recode)
table(ssdata$section_numeric, ssdata$post_spacing_recode)

tb2 <- table(ssdata$section_numeric, ssdata$post_spacing_recode)
tb2
chisq.test(tb2)


```



# Self-Test Percentage Change
```{r}
# self-test pre: section x pre self-test table, tested from the table and
# again directly from the two variables
tb3 <- table(ssdata$section_numeric, ssdata$pre_ss_selftest)
tb3
chisq.test(tb3)
chisq.test(ssdata$pre_ss_selftest, ssdata$section)

# self-test post: section x post self-test table and its chi-square test
tb4 <- table(ssdata$section_numeric, ssdata$post_ss_selftest)
tb4
chisq.test(tb4)

# Sec A change

```



## Spacing Overall Proportion
```{r}

# Pre-Post Differences

# Data for the line graph: the pre and post spacing scores plus section,
# keeping only rows where both scores are present (the `>= 0` comparison is NA
# for missing scores, and subset() drops those rows)
sptab <- subset(
  ssdata,
  pre_spacing_recode >= 0 & post_spacing_recode >= 0,
  select = c(pre_spacing_recode, post_spacing_recode, section)
)

# Reshaping to long format using the reshape2 library
spmelted <- melt(sptab, id.vars = "section")

# labeling the two measures as "Pre" and "Post"
spmelted$variable <- factor(spmelted$variable, labels = c("Pre", "Post"))

# Mean of the 0/1 score (i.e. the proportion) per time point and section
library(Rmisc)
spmelted <- summarySE(
  spmelted,
  measurevar = "value",
  groupvars = c("variable", "section")
)

# Line graph of pre-post changes in spacing by section
spaceplot <- ggplot(
  spmelted,
  aes(x = variable, y = value, group = section, colour = section)
) +
  geom_line(size = 1.5) +
  geom_point(size = 3, shape = 21, colour = "black", fill = "white") +
  coord_fixed(ratio = 3.5) +
  labs(x = "Self-Reported Spacing",
       y = "Proportion of Students Selecting Spacing",
       colour = "Section") +
  scale_y_continuous(limits = c(0, 1), breaks = c(0, .2, .4, .6, .8, 1)) +
  theme(axis.text.x = element_text(size = 12),
        axis.title.x = element_text(size = 15)) +
  scale_colour_grey()

spaceplot
```




```{r}
# PRE: number of students selecting self-testing, by section
table(ssdata$pre_ss_selftest, ssdata$section)

tb3 <- table(ssdata$section_numeric, ssdata$pre_ss_selftest)
tb3

chisq.test(tb3)


# Pairwise comparisons; both use the POST self-test indicator
# A vs B
table(posthoc_secAvB$section)

chisq.test(posthoc_secAvB$section, posthoc_secAvB$post_ss_selftest)

# A v C
table(posthoc_secAvC$section)

chisq.test(posthoc_secAvC$section, posthoc_secAvC$post_ss_selftest)





# POST: number of students selecting self-testing, overall and by section

table(ssdata$post_ss_selftest)
table(ssdata$post_ss_selftest, ssdata$section)

tb4 <- table(ssdata$section_numeric, ssdata$post_ss_selftest)
tb4

chisq.test(tb4)
```


# Self-testing Overall Proportion
```{r, echo = F}
# basic line graph of pre- and post- self-testing

# Pre and post self-test indicators plus section
sttab <- subset(ssdata, select = c(pre_ss_selftest, post_ss_selftest, section))


# removing NAs (the `>= 0` comparison is NA for missing values, and subset()
# drops those rows)
sttab <- subset(sttab, pre_ss_selftest >= 0 & post_ss_selftest >= 0)


# Reshaping data using reshape2 library
stmelted <- melt(sttab, id.vars = c("section"))


# labeling variable to say "pre-" and "post"
stmelted$variable <- factor(stmelted$variable, labels = c("Pre", "Post"))


# Mean of the 0/1 indicator (i.e. the proportion) per time point and section;
# summarySE() comes from the Rmisc package attached in an earlier chunk
stmelted <- summarySE(stmelted, measurevar = "value",
                      groupvars = c("variable", "section"))


# Line graph of pre-post changes in self-testing by section
selftestplot <- ggplot(
  stmelted,
  aes(x = variable, y = value, group = section, colour = section)
) +
  geom_line(size = 1.5) +
  geom_point(size = 3, shape = 21, colour = "black", fill = "white") +
  coord_fixed(ratio = 3.5) +
  labs(x = "Self-Reported Self-Testing",
       y = "Proportion of Students Selecting Self-Testing") +
  scale_y_continuous(limits = c(0, 1), breaks = c(0, .2, .4, .6, .8, 1)) +
  theme(axis.text.x = element_text(size = 12),
        axis.title.x = element_text(size = 15),
        axis.text.y = element_text(size = 6)) +
  scale_colour_grey() +
  labs(colour = "Section")


# Spacing and self-testing panels side by side. grid.arrange() both draws the
# figure and returns it, so one call is enough; the previous extra call
# rendered the same figure twice.
g <- grid.arrange(spaceplot, selftestplot, ncol = 2)

# Arguments are named in full rather than relying on `file` partially matching
# `filename` and on the plot being passed positionally.
# NOTE(review): the output name says "Y2" although this document is titled
# Year 1 -- confirm.
ggsave(filename = "Y2 spacing selftest.png", plot = g,
       width = 8, height = 6, units = "in")

```






### Examining non-effective strategies
# Re-read
```{r, echo = F}
# Pre-Post Differences

# Data for the line graph: pre and post re-reading indicators plus section,
# keeping only rows where both indicators are present (the `>= 0` comparison
# is NA for missing values, and subset() drops those rows)
sptab <- subset(
  ssdata,
  pre_ss_reread >= 0 & post_ss_reread >= 0,
  select = c(pre_ss_reread, post_ss_reread, section)
)

# Reshaping to long format using the reshape2 library
spmelted <- melt(sptab, id.vars = "section")

# labeling the two measures as "Pre" and "Post"
spmelted$variable <- factor(spmelted$variable, labels = c("Pre", "Post"))

# Mean of the 0/1 indicator (i.e. the proportion) per time point and section
library(Rmisc)
spmelted <- summarySE(
  spmelted,
  measurevar = "value",
  groupvars = c("variable", "section")
)

# Line graph of pre-post changes in re-reading by section
rereadplot <- ggplot(
  spmelted,
  aes(x = variable, y = value, group = section, colour = section)
) +
  geom_line(size = 1.5) +
  geom_point(size = 3, shape = 21, colour = "black", fill = "white") +
  coord_fixed(ratio = 3.5) +
  labs(x = "Self-Reported Re-Reading",
       y = "Proportion of Students Selecting Re-Reading",
       colour = "Section") +
  scale_y_continuous(limits = c(0, 1), breaks = c(0, .2, .4, .6, .8, 1)) +
  theme(axis.text.x = element_text(size = 12),
        axis.title.x = element_text(size = 15)) +
  scale_colour_grey()

rereadplot

```


# Condense notes

```{r, echo = F}
# Pre-Post Differences

# Data for the line graph: pre and post condensing-notes indicators plus
# section, keeping only rows where both indicators are present (the `>= 0`
# comparison is NA for missing values, and subset() drops those rows)
sptab <- subset(
  ssdata,
  pre_ss_condense >= 0 & post_ss_condense >= 0,
  select = c(pre_ss_condense, post_ss_condense, section)
)

# Reshaping to long format using the reshape2 library
spmelted <- melt(sptab, id.vars = "section")

# labeling the two measures as "Pre" and "Post"
spmelted$variable <- factor(spmelted$variable, labels = c("Pre", "Post"))

# Mean of the 0/1 indicator (i.e. the proportion) per time point and section
library(Rmisc)
spmelted <- summarySE(
  spmelted,
  measurevar = "value",
  groupvars = c("variable", "section")
)

# Line graph of pre-post changes in condensing notes by section
condenseplot <- ggplot(
  spmelted,
  aes(x = variable, y = value, group = section, colour = section)
) +
  geom_line(size = 1.5) +
  geom_point(size = 3, shape = 21, colour = "black", fill = "white") +
  coord_fixed(ratio = 3.5) +
  labs(x = "Self-Reported Condensing Notes",
       y = "Proportion of Students Condensing Notes",
       colour = "Section") +
  scale_y_continuous(limits = c(0, 1), breaks = c(0, .2, .4, .6, .8, 1)) +
  theme(axis.text.x = element_text(size = 12),
        axis.title.x = element_text(size = 15)) +
  scale_colour_grey()

condenseplot

```


# Flashcards

```{r, echo = F}
# Pre-Post Differences

# Pre and post flashcard indicators plus section
sptab <- subset(ssdata, select = c(pre_ss_flashc, post_ss_flashc, section))

# removing NAs (the `>= 0` comparison is NA for missing values, and subset()
# drops those rows)
sptab <- subset(sptab, pre_ss_flashc >= 0 & post_ss_flashc >= 0)


# Reshaping data using reshape2 library
spmelted <- melt(sptab, id.vars = c("section"))


# labeling variable to say "pre-" and "post"
spmelted$variable <- factor(spmelted$variable, labels = c("Pre", "Post"))


# Mean of the 0/1 indicator (i.e. the proportion) per time point and section
library(Rmisc)
spmelted <- summarySE(spmelted, measurevar = "value",
                      groupvars = c("variable", "section"))


# Line graph of pre-post changes in flashcard use by section
flashcardplot <- ggplot(
  spmelted,
  aes(x = variable, y = value, group = section, colour = section)
) +
  geom_line(size = 1.5) +
  geom_point(size = 3, shape = 21, colour = "black", fill = "white") +
  coord_fixed(ratio = 3.5) +
  labs(x = "Self-Reported Flashcard Use",
       y = "Proportion of Students Selecting Flashcards") +
  scale_y_continuous(limits = c(0, 1), breaks = c(0, .2, .4, .6, .8, 1)) +
  theme(axis.text.x = element_text(size = 12),
        axis.title.x = element_text(size = 15)) +
  scale_colour_grey() +
  labs(colour = "Section")


# Re-reading and flashcard panels side by side. grid.arrange() both draws the
# figure and returns it, so one call is enough; the previous extra call
# rendered the same figure twice.
g <- grid.arrange(rereadplot, flashcardplot, ncol = 2)

# Arguments are named in full rather than relying on `file` partially matching
# `filename` and on the plot being passed positionally.
ggsave(filename = "Y2 reread flashcard.png", plot = g,
       width = 8, height = 6, units = "in")


```





# SPACING PATTERNS

```{r}
# coding data on 4 profiles
# 3. Maintained Strategy       (pre = 1, post = 1)
# 2. Adopted Strategy          (pre = 0, post = 1)
# 1. Stopped Utilizing Strategy (pre = 1, post = 0)
# 0. Never Utilized Strategy   (pre = 0, post = 0)

# Initialising `spacing_cat`. The column is not created anywhere in this
# document, and as.numeric() of a non-existent column is a zero-length vector
# that cannot be assigned as a column; start from NA in that case. If the
# column does exist in the data, it is converted to numeric as before.
if ("spacing_cat" %in% names(ssdata)) {
  ssdata$spacing_cat <- as.numeric(ssdata$spacing_cat)
} else {
  ssdata$spacing_cat <- NA_real_
}
ssdata$spacing_cat[ssdata$pre_spacing_recode == 1 & ssdata$post_spacing_recode == 1] <- 3
ssdata$spacing_cat[ssdata$pre_spacing_recode == 0 & ssdata$post_spacing_recode == 1] <- 2
ssdata$spacing_cat[ssdata$pre_spacing_recode == 1 & ssdata$post_spacing_recode == 0] <- 1
ssdata$spacing_cat[ssdata$pre_spacing_recode == 0 & ssdata$post_spacing_recode == 0] <- 0


table(ssdata$spacing_cat)

table(ssdata$section_numeric)


# Boxplot of final grade by spacing profile; the diamond marks the group mean.
# The former `na.omit = TRUE` entry inside aes() was dropped: it is not an
# aesthetic used by these layers and had no effect on the plot.
ggplot(ssdata, aes(x = as.factor(spacing_cat), y = grade_rec)) +
  geom_boxplot() +
  stat_summary(fun.y = mean, geom = "point", shape = 5, size = 4) +
  labs(x = "Spacing Strategy Adoption", y = "Final Grade",
       title = "Final Grade by Strat Adoption") +
  scale_y_continuous(limits = c(0, 13), breaks = 1:13)
```



## Spacing Patterns Proportions
## (Table 4)
```{r}

# Stacked bar chart of the four spacing profiles by section

# Spacing profile (0-3) and section for every student
spacetab <- subset(ssdata, select = c(spacing_cat, section))


# Reshaping data using reshape2 library
spacemelted <- melt(spacetab, id.vars = c("section"))


# Labelling the profile codes. `levels = 0:3` pins each label to its code, so
# the labels stay correct (and no error is raised) if one of the four profiles
# does not occur in the data.
spacemelted$spacing_catfac <- factor(spacemelted$value,
                                     levels = 0:3,
                                     labels = c("Never Used", "Decreased",
                                                "Increased", "Maintained"))


# Counting students per section and profile (column N of the summarySE output)
library(Rmisc)
spacemelted <- summarySE(spacemelted, measurevar = "value",
                         groupvars = c("section", "spacing_catfac"))


# reversing category order
spacemelted$spacing_catfac <- factor(spacemelted$spacing_catfac,
                                     levels = rev(levels(spacemelted$spacing_catfac)))

# obtaining proportions for category by section
# NOTE(review): students with a missing profile (NA) appear to form their own
# group here and would then count towards each section's total -- confirm
# whether they should be excluded from the percentages.

spacemelted$n <- spacemelted$N

# summarise() drops the last grouping level, so the mutate() that follows
# computes each profile's percentage within its section
spacemelted <- spacemelted %>%
  group_by(section, spacing_catfac) %>%
  summarise(n = n) %>%
  mutate(freq = n / sum(n) * 100)


# Version 1: stacked percentage bars, one bar per section
space1 <- ggplot(spacemelted, aes(x = section, y = freq)) +
  geom_bar(aes(fill = spacing_catfac), stat = "identity") +
  labs(x = "Section", y = "Percent", fill = "", title = "Spacing Patterns") +
  scale_fill_manual(values = c("#669966", "#99CC99", "#FF6666", "#CC0000")) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))


space1

ggsave("Y2 Space Categories.png", width = 8, height = 4, units = "in", space1)


# Printing the percentages and the section sizes for reference
spacemelted
table(ssdata$section)

# Chi-square test of section by spacing profile (two equivalent routes)
chisq.test(ssdata$section_numeric, ssdata$spacing_cat)
summary(table(ssdata$section_numeric, ssdata$spacing_cat))

```




## Spacing bonferroni
### Intervention vs. Control Section 1
```{r}
# Section and spacing profile for the Intervention section (0) and Control
# Section 1 (1); rows from Control Section 2 (2) are removed
sp_i_1 <- subset(
  ssdata,
  section_numeric != 2,
  select = c(section_numeric, spacing_cat)
)
sp_i_1
table(sp_i_1$section_numeric)


# Aggregating to a section x profile table of frequency counts
sp_i_1 <- table(sp_i_1$section_numeric, sp_i_1$spacing_cat)
sp_i_1

sum(sp_i_1)

# Labeling rows (sections) and columns (profiles 0-3)
dimnames(sp_i_1) <- list(
  section_numeric = c("Intervention", "Control Section 1"),
  spacing_cat = c("None", "Dec", "Inc", "Maint")
)

# Bonferroni-adjusted post-hoc chi-square comparisons (fifer package)
library(fifer)
chisq.post.hoc(sp_i_1, control = "bonferroni", popsInRows = FALSE)

```



### Intervention vs. Control Section 2
```{r}
# Section and spacing profile for the Intervention section (0) and Control
# Section 2 (2); rows from Control Section 1 (1) are removed
sp_i_2 <- subset(
  ssdata,
  section_numeric != 1,
  select = c(section_numeric, spacing_cat)
)
sp_i_2
table(sp_i_2$section_numeric)

# Aggregating to a section x profile table of frequency counts
sp_i_2 <- table(sp_i_2$section_numeric, sp_i_2$spacing_cat)

# Labeling rows (sections) and columns (profiles 0-3)
dimnames(sp_i_2) <- list(
  section_numeric = c("Intervention", "Control Section 2"),
  spacing_cat = c("None", "Dec", "Inc", "Maint")
)

# Bonferroni-adjusted post-hoc chi-square comparisons
chisq.post.hoc(sp_i_2, control = "bonferroni", popsInRows = FALSE)

```





# SELF-TESTING PATTERNS
## (Table 4)
```{r}


# Self-Test

# 3. Maintained Strategy       (pre = 1, post = 1)
# 2. Adopted Strategy          (pre = 0, post = 1)
# 1. Stopped Utilizing Strategy (pre = 1, post = 0)
# 0. Never Utilized Strategy   (pre = 0, post = 0)

# Starting from an explicit NA column instead of relying on the first indexed
# assignment to create it
ssdata$selftest_cat <- NA_real_
ssdata$selftest_cat[ssdata$pre_ss_selftest == 1 & ssdata$post_ss_selftest == 1] <- 3
ssdata$selftest_cat[ssdata$pre_ss_selftest == 0 & ssdata$post_ss_selftest == 1] <- 2
ssdata$selftest_cat[ssdata$pre_ss_selftest == 1 & ssdata$post_ss_selftest == 0] <- 1
ssdata$selftest_cat[ssdata$pre_ss_selftest == 0 & ssdata$post_ss_selftest == 0] <- 0

# Self-testing profile (0-3) and section for every student
selftesttab <- subset(ssdata, select = c(selftest_cat, section))


# Reshaping data using reshape2 library
selftestmelted <- melt(selftesttab, id.vars = c("section"))


# Labelling the profile codes. `levels = 0:3` pins each label to its code, so
# the labels stay correct (and no error is raised) if one of the four profiles
# does not occur in the data.
selftestmelted$st_catfac <- factor(selftestmelted$value,
                                   levels = 0:3,
                                   labels = c("Never Used", "Decreased",
                                              "Increased", "Maintained"))


# Counting students per section and profile (column N of the summarySE output)
library(Rmisc)
selftestmelted <- summarySE(selftestmelted, measurevar = "value",
                            groupvars = c("section", "st_catfac"))


# reversing category order
selftestmelted$st_catfac <- factor(selftestmelted$st_catfac,
                                   levels = rev(levels(selftestmelted$st_catfac)))

# obtaining proportions for category by section

selftestmelted$n <- selftestmelted$N

# summarise() drops the last grouping level, so the mutate() that follows
# computes each profile's percentage within its section
selftestmelted <- selftestmelted %>%
  group_by(section, st_catfac) %>%
  summarise(n = n) %>%
  mutate(freq = n / sum(n) * 100)


# Version 1: stacked percentage bars, one bar per section

st1 <- ggplot(selftestmelted, aes(x = section, y = freq)) +
  geom_bar(aes(fill = st_catfac), stat = "identity") +
  labs(x = "Section", y = "Percent", fill = "",
       title = "Self-Testing Patterns") +
  scale_fill_manual(values = c("#669966", "#99CC99", "#FF6666", "#CC0000")) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))


st1

# Spacing and Self-Test Proportion Chart (`space1` is built in an earlier
# chunk)
g <- grid.arrange(space1, st1, ncol = 2)
ggsave("Y2 Space & Self-Test Categories.png", width = 8, height = 4, units = "in", g)


```


## Self-testing Bonferroni
### Intervention vs. Control Section 1
```{r}

# Intervention vs. Control Section 1: keep the section code and the
# self-testing category, excluding rows whose section_numeric equals 2.
st_i_1 <- subset(ssdata,
                 subset = section_numeric != 2,
                 select = c(section_numeric, selftest_cat))

st_i_1

# Row counts per remaining section code
table(st_i_1$section_numeric)

# Section x self-testing-category contingency table (replaces the data frame)
st_i_1 <- table(st_i_1[["section_numeric"]], st_i_1[["selftest_cat"]])

st_i_1
sum(st_i_1)

# labeling (applied positionally to the table's rows and columns)
dimnames(st_i_1) <- list(
  section_numeric = c("Intervention", "Control Section 1"),
  selftest_cat = c("None", "Dec", "Inc", "Maint")
)

# Bonferroni-adjusted pairwise chi-square comparisons
chisq.post.hoc(st_i_1, control = "bonferroni", popsInRows = FALSE)

```

## Self-testing Bonferroni
### Intervention vs. Control Section 2
```{r}
# Intervention vs. Control Section 2: keep the section code and the
# self-testing category, excluding rows whose section_numeric equals 1.
st_i_2 <- subset(ssdata,
                 subset = section_numeric != 1,
                 select = c(section_numeric, selftest_cat))
st_i_2

# Row counts per remaining section code
table(st_i_2$section_numeric)

# Section x self-testing-category contingency table (replaces the data frame)
st_i_2 <- table(st_i_2[["section_numeric"]], st_i_2[["selftest_cat"]])
st_i_2

# labeling (applied positionally to the table's rows and columns)
dimnames(st_i_2) <- list(
  section_numeric = c("Intervention", "Control Section 2"),
  selftest_cat = c("None", "Dec", "Inc", "Maint")
)

# Bonferroni-adjusted pairwise chi-square comparisons
chisq.post.hoc(st_i_2, control = "bonferroni", popsInRows = FALSE)

```


## URM Analysis
```{r}
# Counts of students by section, with bars filled by URM status
# (geom_bar's default position is used here).

# URM codes before relabelling
table(ssdata$urm)

# Relabel urm as a factor; labels are assigned in level order
ssdata$urm <- factor(ssdata$urm, labels = c("non-URM", "URM"))

urmfreq <- ggplot(ssdata, aes(x = section)) +
  geom_bar(aes(fill = urm)) +
  scale_fill_manual(values = c("#EB6F00", "#A9A9A9")) +
  scale_y_continuous(limits = c(0, 350)) +
  labs(x = NULL, y = "Count", fill = "URM Status") +
  theme(axis.text = element_text(size = 12))

urmfreq

ggsave("Y2 URM Freq.png", plot = urmfreq, width = 4, height = 5, units = "in")

# URM counts after relabelling
table(ssdata$urm)

# Final grade compared across URM status, plus group descriptives
t.test(ssdata$grade_rec ~ ssdata$urm)
describeBy(ssdata$grade_rec, group = ssdata$urm)

# Prints the raw post-survey self-testing indicator for inspection
ssdata$post_ss_selftest
```

## URM Analysis 3 x 2 x 2 Model
```{r}
# 3 x 2 x 2 Model: grade_rec ~ section x urm x post_ss_selftest

# Convert the self-testing indicator to a factor BEFORE fitting the model.
# TukeyHSD() only computes comparisons for factor terms, so fitting with the
# indicator still numeric silently drops every post-hoc comparison that
# involves self-testing.
ssdata$post_ss_selftest <- factor(ssdata$post_ss_selftest,
                                  labels = c("Other Strat", "Self-Test"))

yearsfit <- aov(grade_rec ~ section * urm * post_ss_selftest, data = ssdata)
summary(yearsfit)
TukeyHSD(yearsfit) # post-hoc test
confint(yearsfit)


# descriptives for urm vs. non-urm self-testers
table(ssdata$post_ss_selftest)
st_urm <- subset(ssdata, post_ss_selftest == "Self-Test")
table(st_urm$urm, exclude = NULL)
describeBy(st_urm$grade_rec, st_urm$urm)

# descriptives for urm vs. non-urm students who didn't self test
# (st_urm is reused and now holds the "Other Strat" subset)
table(ssdata$post_ss_selftest)
st_urm <- subset(ssdata, post_ss_selftest == "Other Strat")
table(st_urm$urm, exclude = NULL)
describeBy(st_urm$grade_rec, st_urm$urm)

```

# URM Analysis (Fig 4)
```{r}
# Boxplots of final grade (grade_rec) by URM status, section and
# post-survey self-testing. A diamond marks the group mean in every plot.

pd <- position_dodge(0.75)

# Pieces shared by every plot below.
# `na.rm = TRUE` belongs on the layers; the original `na.omit = TRUE` inside
# aes() is not an aesthetic and had no effect.
grade_axis <- scale_y_continuous(limits = c(0, 13), breaks = 1:13)
mean_marker <- stat_summary(fun.y = mean, geom = "point", shape = 5, size = 4,
                            position = pd, na.rm = TRUE)
urm_fill <- scale_fill_manual(values = c("#EB6F00", "#A9A9A9"))


# Final Grade by URM Status
# (no fill scale here: nothing is mapped to fill in this plot)
ggplot(ssdata, aes(x = urm, y = grade_rec)) +
  geom_boxplot(na.rm = TRUE) +
  mean_marker +
  labs(x = "URM Status", y = "Final Grade",
       title = "Final Grade by URM Status") +
  grade_axis


# Grades by Section and URM status
ggplot(ssdata, aes(x = section, y = grade_rec, fill = urm)) +
  geom_boxplot(na.rm = TRUE) +
  mean_marker +
  labs(x = "Section", y = "Final Grade",
       title = "Final Grade by Section and URM Status") +
  grade_axis +
  urm_fill


# Grades by self-test
# (axis label and title previously said "URM Status" although the x axis is
# post_ss_selftest)
ggplot(ssdata, aes(x = post_ss_selftest, y = grade_rec)) +
  geom_boxplot(na.rm = TRUE) +
  mean_marker +
  labs(x = "Self-Testing", y = "Final Grade",
       title = "Final Grade by Self-Testing") +
  grade_axis


# self-test x urm
twoway <- ggplot(ssdata, aes(x = post_ss_selftest, y = grade_rec, fill = urm)) +
  geom_boxplot(na.rm = TRUE) +
  mean_marker +
  labs(x = NULL, y = "Final Grade") +
  grade_axis +
  scale_fill_discrete(name = NULL) +
  theme(legend.text = element_text(size = 14),
        axis.title = element_text(size = 14),
        axis.text.x = element_text(size = 12))

# Same plot with a labelled x axis (previously mislabelled "URM Status")
twoway + labs(x = "Self-Testing")

# Version without an x-axis title; this is the one written to disk
twoway

ggsave("Y2 URM Selftest Boxplot.png", plot = twoway,
       width = 6, height = 4, units = "in")


```




## Regression Model
## (Table 6)
Model with OIR data (age, gender, years, urm, sat)
```{r}
# with urm

# Standardize SAT total score. scale() returns a one-column matrix with
# attributes; as.numeric() stores it as a plain numeric vector so the data
# frame does not end up with a matrix column.
ssdata$sat_zscore <- as.numeric(
  scale(ssdata$sattotalscore, center = TRUE, scale = TRUE)
)

describe(ssdata$sat_zscore)

# Treat the category codes and section code as factors in the model
ssdata$spacing_cat.f <- factor(ssdata$spacing_cat)
ssdata$selftest_cat.f <- factor(ssdata$selftest_cat)
ssdata$section_numeric.f <- factor(ssdata$section_numeric)

# Linear model of final grade on study-strategy variables, section,
# years_rec, gender, URM status and standardized SAT
results <- lm(grade_rec ~ spacing_cat.f + selftest_cat.f + post_ss_condense +
                post_ss_flashc + section_numeric.f + years_rec +
                gender_rec2 + urm + sat_zscore,
              data = ssdata)

results
summary(results)
confint(results, level = .95)

```






